# Assign various spatial characteristics for homes
pacman::p_load(raster, tidyverse, fst, modelsummary, mapview, sf, nngeo, qs, data.table)
sf_use_s2(FALSE)

source("code/globals.R")

# Load statewide sample
homes <- read_fst(file.path(WORKING, "ztraxdata", "CAattr2018.fst"),
  columns = c("ImportParcelID", "BuildingOrImprovementNumber", "FIPS", "State", "PropertyFullStreetAddress", "PropertyHouseNumber", "PropertyStreetName", "PropertyStreetSuffix", "PropertyCity", "PropertyState", "PropertyZip", "PropertyAddressLongitude", "PropertyAddressLatitude", "LotSizeAcres", "YearBuilt", "EffectiveYearBuilt", "TotalBedrooms", "PropertyLandUseStndCode", "sqfeet", "TotalAssessedValue"),
  to = NULL
)


# Apply the same filters we used prior to geocoding
homes <- homes %>%
  filter(PropertyLandUseStndCode %in% c("RR101", "RR102")) %>%
  filter(BuildingOrImprovementNumber == 1) %>%
  filter(!is.na(PropertyAddressLongitude) & !is.na(PropertyAddressLatitude))


# Create a combinedyear built that reflects remodels
# As for the main sample, use effective year built if it is newer than year built, year built otherwise
homes <- homes %>%
  mutate(combinedyear = case_when(
    !is.na(EffectiveYearBuilt) & EffectiveYearBuilt > YearBuilt ~ EffectiveYearBuilt,
    !is.na(YearBuilt) ~ YearBuilt,
    is.na(YearBuilt) & !is.na(EffectiveYearBuilt) ~ EffectiveYearBuilt
  )) %>%
  mutate(combinedyear = case_when(
    is.na(combinedyear) | combinedyear < 1900 ~ 1900, # Important: if combinedyear is missing, we assume pre-codes
    TRUE ~ combinedyear
  ))

# Load geocoding from ArcGIS
statewide_geocoded <- read_fst(file.path(WORKING, "statewide-geocoded.fst"), as.data.table = T)

homes <- homes %>% left_join(statewide_geocoded, by = c("ImportParcelID", "BuildingOrImprovementNumber"))

# Assign spatial locations based on best available location fields
# Use ArcGIS if available, ZTRAX if not
homes <- homes %>%
  mutate(
    lon = case_when(
      !is.na(lon_ESRI) ~ lon_ESRI,
      !is.na(PropertyAddressLongitude) ~ PropertyAddressLongitude,
      TRUE ~ NA_real_
    ),
    lat = case_when(
      !is.na(lat_ESRI) ~ lat_ESRI,
      !is.na(PropertyAddressLatitude) ~ PropertyAddressLatitude,
      TRUE ~ NA_real_
    ),
    location_source = case_when(
      !is.na(lon_ESRI) ~ "ESRI",
      !is.na(PropertyAddressLongitude) ~ "ZTRAX"
    )
  )

homes <- homes %>%
  st_as_sf(coords = c("lon", "lat"), crs = 4326, remove = F) %>%
  st_transform(3310) # CA Albers (in meters)

# Assign SRA / FHSZ information (see globals.R for the defn of this function)
homes <- get_SRA_FHSZ_status(homes)

# Drop unneeded variables
homes <- homes %>%
  select(-c(SRA_1990, SRA_1996, SRA_2003, SRA_2005, SRA_2010, SRA_2011, SRA_2012, SRA_2013, SRA_2014, SRA_2015, SRA_2016, SRA_2017, SRA_2018, SRA_2019, SRA_2020, hazclass_sra_85, hazclass_sra_07, LRAFHSZ_1998, hazclass_lra_98, LRAFHSZ_2008, hazclass_lra_08, vh_sra_85, vh_sra_07))

# Assign information from WRC (see globals.R for the defn of this function)
# We compute this variable earlier for choosing what to geocode, but now we recompute with the improved locations -- differences should be relatively minor in most cases
homes <- assign_wildfire_hazard(homes)

# Assign other spatial information
homes <- assign_spatial(homes)

# Compute neighbors
# This is quite fast as long as I make sure projection is in meters
neighbor_mtx_sparse <- st_nn(homes %>% select("ImportParcelID", "BuildingOrImprovementNumber"),
  homes %>% select("ImportParcelID", "BuildingOrImprovementNumber", "sqfeet"),
  k = 8, # Cap at 8, since any more than this is implausible - can do a robustness check with < 7 to make sure this isn't impactful (doesn't change for many homes)
  maxdist = 30
) # Compute centroid distances out to 30m -- similar to 10m wall-to-wall

# Neighbor-mapping should be a data frame mapping each home to itself and enigbhors
neighbor_mapping <- lapply(neighbor_mtx_sparse, function(x) {
  data.frame(neighbor_idx = x)
}) %>%
  bind_rows(.id = "self_idx") %>%
  mutate(self_idx = as.numeric(self_idx))

# Save neighbor square footage
neighbor_mapping$sqfeet_nbr <- homes$sqfeet[neighbor_mapping$neighbor_idx]

# For each home, compute counts average neighbor square footage (excluding self)
setDT(neighbor_mapping)

neighbor_counts <- neighbor_mapping[, .(neighbors_in_30m_centroid = .N - 1), by = self_idx]
neighbor_sqft <- neighbor_mapping[self_idx != neighbor_idx,
  .(NeighborSqFt = mean(sqfeet_nbr, na.rm = T)),
  by = self_idx
]
neighbor_chars <- merge(neighbor_counts, neighbor_sqft, by = "self_idx", all.x = T)

homes$self_idx <- 1:nrow(homes)

homes <- merge(homes, neighbor_chars, by = "self_idx", all.x = T)

# Convert FIPS to numeric with 0 padding
homes <- homes %>% mutate(FIPS = sprintf("%05d", FIPS))


# Construct equivalent of street FE
homes <- homes %>%
  mutate(
    street = paste(PropertyStreetName, PropertyStreetSuffix, FIPS),
    sqfeet1000 = sqfeet / 1000
  )

# Substreets group every 25 homes within a street
homes <- homes %>%
  group_by(street) %>%
  mutate(street_id = 1:n()) %>%
  mutate(street25 = paste(street, ceiling(street_id / 25))) %>%
  ungroup()

# Construct period variable based on year built or year of major remodel
homes <- homes %>%
  mutate(period = factor(case_when(
    combinedyear < 1997 ~ 0,
    combinedyear >= 1998 & combinedyear < 2008 ~ 1,
    combinedyear >= 2008 ~ 2
  )))

# Define regime (follows main data construction of regime)
homes <- homes %>%
  mutate(regime = case_when(
    SRA == "SRA" ~ "sra",
    SRA == "FRA" ~ "fra",
    SRA != "SRA" & vh_lra_98 == TRUE & vh_lra_08 == TRUE ~ "lraYY",
    SRA != "SRA" & vh_lra_98 == FALSE & vh_lra_08 == TRUE ~ "lraNY",
    SRA != "SRA" & vh_lra_98 == TRUE & vh_lra_08 == FALSE ~ "lraYN",
    SRA != "SRA" & vh_lra_98 == FALSE & vh_lra_08 == FALSE ~ "lraNN"
  ))

# Construct treatment group
homes <- homes %>%
  mutate(treatmentgroup = case_when(
    regime %in% c("otherstates", "lraNN") ~ "No-codes",
    regime == "sra" ~ "SRA",
    regime %in% c("lraYY", "lraNY", "lraYN") ~ "LRA-VHFHSZ"
  ))

# Get SRA/LRA border
# Load SRA/LRA buffer, anything in this area is within 1km of a border
# From 40-compute-sra-lra-buffer.R
borders <- qread(file.path(WORKING, "sra-lra-borders.qs"))

# Join to all borders, sort by border size, and then keep only the number of the closest border
homes <- homes %>%
  st_transform(st_crs(borders)) %>%
  st_join(borders, left = T) %>%
  arrange(FIPS, ImportParcelID, border_size) %>%
  distinct(FIPS, ImportParcelID, .keep_all = T)

# Generate variables for all potential borders
gen_border_variable <- function(x) {
  print(x)
  homes[[paste0("border_", x)]] <<- !is.na(homes$border_size) & homes$border_size <= x
  return(NULL)
}

invisible(lapply(unique(borders$border_size), gen_border_variable))
homes$border_size <- NULL

# Drop geometry
homes <- homes %>% st_drop_geometry()

# Save
write_fst(homes, file.path(WORKING, "statewide-sample.fst"))
